import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.offline as ply
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn import metrics
import warnings
from IPython.display import Image
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from itertools import combinations
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.metrics import confusion_matrix, classification_report
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
Lending institutions all over the world need to classify whether a loan is acceptable (i.e., good) or not (i.e., default) to make their lending decisions. They can do this by relating the outcomes of those loans which were given (1= default, 0= good) to features such as home ownership, annual income, and the debt to income ratio of the credit applicant.
# Display the LendingClub logo (requires Lending-Club.jpg next to the notebook).
Image(filename='Lending-Club.jpg')
Gain insights on the parameters that affect the default of loan given to credit applicant by LendingClub according to various parameters and build a model that predicts if loan was defaulted
Or maybe we don't have enough information to classify correctly whether a loan was defaulted, and it depends on additional features, such as the borrower's character?
Lending Club is a peer-to-peer lender that allows investors to lend money to borrowers without an intermediary being involved.
# Load the LendingClub loan data and take a first look at it.
df = pd.read_csv('Lending_Club.csv')
df.columns
df.describe().T
df.info()
# Normalize the column names: strip surrounding whitespace, upper-case them.
df.columns = [column.strip().upper() for column in df.columns]
print("Lending Club DF with NA values:")
print(df.columns[df.isna().any()].tolist())
df.columns
# Count the NA values in the column flagged above.
print("Counting NA values per recognized columns with NA:")
# Fixed typo in the printed label ("Valus" -> "Values").
print("EMPLOYMENT_LENGTH NA Values:" + str(df.EMPLOYMENT_LENGTH.isna().sum()))
df.info()
# Fill missing values and drop identifier columns that carry no signal.
df_loan = df.copy()
# Forward-fill missing values. df.fillna(method='ffill') is deprecated and
# removed in modern pandas; .ffill() is the equivalent, stable spelling.
df_loan = df_loan.ffill()
# Drop the per-record identifiers: unique IDs cannot generalize in a model.
df_loan = df_loan.drop(["LOAN_ID", "BORROWER_ID"], axis=1)
print("Current columns with NA values")
print(df_loan.columns[df_loan.isna().any()].tolist())
# Drop any records that still contain NA (e.g. leading NAs ffill cannot fill).
df_loan = df_loan.dropna()
df_loan.info()
# Highlight non-positive summary statistics in yellow as a quick sanity check.
df_loan.describe().style.apply(lambda x: ["background: yellow" if v <= 0 else "" for v in x], axis = 1)
# Print the unique values of every feature, then the value counts of the
# categorical/target columns. Output is identical to printing each column
# by hand; the loops just remove the repetition.
unique_cols = ['LOAN_AMOUNT', 'LOAN_TERM', 'INTEREST_RATE', 'MONTHLY_PAYMENT',
               'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING',
               'ANNUAL_INCOME', 'DEBT_TO_INCOME', 'DEFAULT']
for column in unique_cols:
    print(column + " unique values:" + str(df_loan[column].unique()))
count_cols = ['LOAN_AMOUNT', 'LOAN_TERM', 'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH',
              'HOUSING', 'ANNUAL_INCOME', 'DEBT_TO_INCOME', 'DEFAULT']
for column in count_cols:
    print(column + " values count:")
    print(df_loan[column].value_counts())
# Bin EMPLOYMENT_LENGTH into coarser buckets.
# The original cell mutated the column cell-by-cell with chained indexing
# (df_loan['EMPLOYMENT_LENGTH'][i] = ...), which triggers pandas'
# SettingWithCopy pitfalls AND breaks outright once dropna() has left a
# non-contiguous integer index (positional i is then a missing label).
# A vectorized map is correct, index-safe, and far faster.
_EMPLOYMENT_BINS = {
    "< 1 year": "< 1 year",
    "1 year": "1-2 Years",
    "2 years": "1-2 Years",
    "3 years": "3-4 Years",
    "4 years": "3-4 Years",
    "5 years": "5-6 Years",
    "6 years": "5-6 Years",
    "7 years": "7-8 Years",
    "8 years": "7-8 Years",
    "9 years": "9-10 Years",
    "10 years": "9-10 Years",
}
# Anything not listed above (e.g. "10+ years") falls into ">10 Years",
# exactly matching the original loop's else-branch.
df_loan['EMPLOYMENT_LENGTH'] = (
    df_loan['EMPLOYMENT_LENGTH'].map(_EMPLOYMENT_BINS).fillna(">10 Years")
)
print("EMPLOYMENT_LENGTH :", df_loan['EMPLOYMENT_LENGTH'].unique())
# Summary statistics, then per-outcome feature means.
df_loan.describe().T
left = df_loan.groupby('DEFAULT')
left.mean()
# Count plot of the target variable: good loans (0) vs defaults (1).
sns.countplot(x='DEFAULT', data=df_loan)
plt.xlabel('Loan Outcome')
plt.ylabel("Number of Loans")
plt.title("Loan Classification")
plt.show()
pd.value_counts(df_loan['DEFAULT'].values,normalize=True)
df_loan['DEFAULT'].value_counts()
# Plotly bar chart of the class distribution.
# NOTE(review): the 'index'/'DEFAULT' column names below match pandas < 2.0;
# on pandas >= 2.0, value_counts().reset_index() yields 'DEFAULT'/'count'
# instead -- confirm the pandas version before upgrading.
df_plot = df_loan['DEFAULT'].value_counts().reset_index()
plot_data = [
go.Bar(
x=df_plot['index'],
y=df_plot['DEFAULT'],
width = [0.5, 0.5],
marker=dict(
color=['blue', 'orange'])
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},
yaxis={"title": "DEFAULT"},
title='DEFAULT',
plot_bgcolor = 'rgb(243,243,243)',
paper_bgcolor = 'rgb(243,243,243)',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
There are 22442 no default loans and 2557 default loans in the outcome variables.
# Class balance: share of good loans vs defaulted loans.
count_no_default = len(df_loan.loc[df_loan['DEFAULT'] == 0])
count_default = len(df_loan.loc[df_loan['DEFAULT'] == 1])
total = count_no_default + count_default
pct_of_no_default = count_no_default / total
print("percentage of no default is", pct_of_no_default*100)
pct_of_default = count_default / total
print("percentage of default", pct_of_default*100)
%matplotlib inline
# Default frequency split by home ownership.
pd.crosstab(df_loan.HOUSING,df_loan.DEFAULT).plot(kind='bar')
plt.title('Default Frequency for Home Ownership')
plt.xlabel('Home Ownership')
plt.ylabel('Number of Loans')
plt.savefig('default_fre_home')
# Proportion of defaults within each employment-length bucket (rows sum to 1).
table=pd.crosstab(df_loan['EMPLOYMENT_LENGTH'],df_loan['DEFAULT'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of EMPLOYMENT_LENGTH vs Default')
plt.xlabel('EMPLOYMENT_LENGTH')
plt.ylabel('Proportion of Customers')
# NOTE(review): the saved filename looks like a copy-paste leftover from a
# different notebook ("mariral_vs_pur_stack") -- consider renaming it.
plt.savefig('mariral_vs_pur_stack')
%matplotlib inline
import matplotlib
# Enlarge the default figure size for the KDE plots below.
matplotlib.rcParams['figure.figsize'] = (10.0, 10.0)
import matplotlib.pyplot as plt
import seaborn as sns
# Annual-income density by loan outcome. In the original both curves carried
# the label 'not default'; the DEFAULT == 1 curve is the defaulted group.
sns.kdeplot(df_loan['ANNUAL_INCOME'].loc[df_loan['DEFAULT'] == 0], label='not default', shade=True);
sns.kdeplot(df_loan['ANNUAL_INCOME'].loc[df_loan['DEFAULT'] == 1], label='default', shade=True);
# Horizontal bar chart: number of loans per employment-length bucket.
num_projects=df_loan.groupby('EMPLOYMENT_LENGTH').count()
plt.barh(num_projects.index.values, num_projects['LOAN_AMOUNT'], color=['blue', 'orange','green','red','purple','chocolate','pink','grey'])
plt.xlabel('Number of Loans')
plt.ylabel("borrower's Employment Length")
plt.title('Loans Frequency for Employment Length')
plt.show()
# Horizontal bar chart: number of loans per loan purpose.
num_projects=df_loan.groupby('LOAN_PURPOSE').count()
plt.barh(num_projects.index.values, num_projects['LOAN_AMOUNT'], color=['blue', 'orange','green','red','purple','chocolate','pink','grey'])
plt.xlabel('Number of Loans')
# The y-axis shows loan purposes; the original label ("borrower's Employment
# Length") was copy-pasted from the previous chart.
plt.ylabel('Loan Purpose')
plt.title('Loans Frequency for Loan purpose')
plt.show()
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of every numeric column, saved to disk.
df_loan.hist(bins=10, figsize=(20,15))
plt.savefig("attribute_histogram_plots")
plt.show()
# Count plots for the categorical features and the target.
features2=['LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING', 'DEFAULT']
fig=plt.subplots(figsize=(30,30))
for i, j in enumerate(features2):
    plt.subplot(4, 2, i+1)
    plt.subplots_adjust(hspace = 1.0)
    sns.countplot(x=j,data = df_loan)
    plt.xticks(rotation=90)
    plt.title("Number of Loans")
# Same count plots, now split by loan outcome (hue=DEFAULT).
fig=plt.subplots(figsize=(30,30))
for i, j in enumerate(features2):
    plt.subplot(4, 2, i+1)
    plt.subplots_adjust(hspace = 1.0)
    sns.countplot(x=j,data = df_loan, hue='DEFAULT')
    plt.xticks(rotation=90)
    plt.title("Number of Loans")
# Mean default rate per loan amount (plotly scatter).
df_plot = df_loan.groupby('LOAN_AMOUNT').DEFAULT.mean().reset_index()
plot_data = [
go.Scatter(
x=df_plot['LOAN_AMOUNT'],
y=df_plot['DEFAULT'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
),
)
]
plot_layout = go.Layout(
yaxis= {'title': "DEFAULT"},
xaxis= {'title': "LOAN_AMOUNT"},
title='LOAN_AMOUNT vs DEFAULT',
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
# Histogram of loan amounts.
df_loan.LOAN_AMOUNT.hist(bins=50, figsize=(20,15), color=['green'])
plt.savefig("LOAN_AMOUNT_histogram_plot")
plt.show()
#mean DEFAULT by LOAN_TERM
# Note the leading space in the term labels (' 60 months') -- it matches the
# raw data values, so reindex finds them.
ax=df_loan.groupby(['LOAN_TERM']).mean().reindex([' 60 months', ' 36 months'])['DEFAULT'].plot.bar(figsize=(12, 5),color=['red','purple'])
ax.set_title('DEFAULT mean LOAN_TERM')
ax.set_ylabel('DEFAULT mean')
# Mean default rate per interest rate (plotly scatter).
df_plot = df_loan.groupby('INTEREST_RATE').DEFAULT.mean().reset_index()
plot_data = [
go.Scatter(
x=df_plot['INTEREST_RATE'],
y=df_plot['DEFAULT'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
),
)
]
plot_layout = go.Layout(
yaxis= {'title': "DEFAULT"},
xaxis= {'title': "INTEREST_RATE"},
title='INTEREST_RATE vs DEFAULT',
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
# Histogram of interest rates.
df_loan.INTEREST_RATE.hist(bins=50, figsize=(20,15),color=['chocolate'])
plt.savefig("INTEREST_RATE_histogram_plot")
plt.show()
# Mean default rate per monthly payment (plotly scatter).
df_plot = df_loan.groupby('MONTHLY_PAYMENT').DEFAULT.mean().reset_index()
plot_data = [
go.Scatter(
x=df_plot['MONTHLY_PAYMENT'],
y=df_plot['DEFAULT'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
),
)
]
plot_layout = go.Layout(
yaxis= {'title': "DEFAULT"},
xaxis= {'title': "MONTHLY_PAYMENT"},
title='MONTHLY_PAYMENT vs DEFAULT',
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
# Histogram of monthly payments.
df_loan.MONTHLY_PAYMENT.hist(bins=50, figsize=(20,15),color=['pink'])
plt.savefig("MONTHLY_PAYMENT_histogram_plot")
plt.show()
#mean DEFAULT by LOAN_PURPOSE
# Purposes absent from the data (e.g. 'small_business') show as empty bars
# after reindex.
ax=df_loan.groupby(['LOAN_PURPOSE']).mean().reindex(['debt_consolidation', 'credit_card',\
'small_business', 'medical', 'other',\
'vacation', 'house', 'major_purchase',\
'home_improvement', 'wedding', 'car',\
'moving', 'renewable_energy', 'educational'])['DEFAULT'].plot.bar(figsize=(12, 5),color=['blue', 'orange','green','red','purple','chocolate','pink','grey','black','brown','yellow'])
ax.set_title('DEFAULT mean LOAN_PURPOSE')
ax.set_ylabel('DEFAULT mean')
#mean DEFAULT by EMPLOYMENT_LENGTH
# Bucket labels follow the binning performed earlier on EMPLOYMENT_LENGTH.
ax=df_loan.groupby(['EMPLOYMENT_LENGTH']).mean().reindex(['< 1 year','1-2 Years','3-4 Years',\
'5-6 Years','7-8 Years','9-10 Years',\
'>10 Years'
])['DEFAULT'].plot.bar(figsize=(12, 5),color=['blue', 'orange','green','red','purple','chocolate','pink','grey','black','brown','yellow'])
ax.set_title('DEFAULT mean EMPLOYMENT_LENGTH')
ax.set_ylabel('DEFAULT mean')
#mean DEFAULT by HOUSING
ax=df_loan.groupby(['HOUSING']).mean().reindex(['yes', 'no'])['DEFAULT'].plot.bar(figsize=(12, 5),color=['green','red'])
ax.set_title('DEFAULT mean HOUSING')
ax.set_ylabel('DEFAULT mean')
# Mean default rate per annual income (plotly scatter).
df_plot = df_loan.groupby('ANNUAL_INCOME').DEFAULT.mean().reset_index()
plot_data = [
go.Scatter(
x=df_plot['ANNUAL_INCOME'],
y=df_plot['DEFAULT'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
),
)
]
plot_layout = go.Layout(
yaxis= {'title': "DEFAULT"},
xaxis= {'title': "ANNUAL_INCOME"},
title='ANNUAL_INCOME vs DEFAULT',
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
# Histogram of annual incomes.
df_loan.ANNUAL_INCOME.hist(bins=50, figsize=(20,15),color=['grey'])
plt.savefig("ANNUAL_INCOME_histogram_plot")
plt.show()
# Mean default rate per debt-to-income ratio (plotly scatter).
df_plot = df_loan.groupby('DEBT_TO_INCOME').DEFAULT.mean().reset_index()
plot_data = [
go.Scatter(
x=df_plot['DEBT_TO_INCOME'],
y=df_plot['DEFAULT'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
),
)
]
plot_layout = go.Layout(
yaxis= {'title': "DEFAULT"},
xaxis= {'title': "DEBT_TO_INCOME"},
title='DEBT_TO_INCOME vs DEFAULT',
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
# Histogram of debt-to-income ratios.
df_loan.DEBT_TO_INCOME.hist(bins=50, figsize=(20,15),color=['yellow'])
plt.savefig("DEBT_TO_INCOME_histogram_plot")
plt.show()
#move down
# MONTHLY_PAYMENT vs LOAN_AMOUNT, colored by loan term.
sns.pairplot(x_vars=['MONTHLY_PAYMENT'], y_vars=['LOAN_AMOUNT'], data=df_loan, hue ="LOAN_TERM", size=5)
# Title corrected: hue is LOAN_TERM, not EMPLOYMENT_LENGTH.
plt.title("MONTHLY_PAYMENT vs LOAN_AMOUNT by LOAN_TERM")
# Correlation heat map over the model columns. Non-numeric columns listed
# here are silently ignored by DataFrame.corr().
feature_list = ['LOAN_AMOUNT', 'LOAN_TERM', 'INTEREST_RATE',
                'MONTHLY_PAYMENT', 'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH',
                'HOUSING', 'ANNUAL_INCOME', 'DEBT_TO_INCOME', 'DEFAULT']
sns.set_style()
corr_values = df_loan[feature_list].corr()
sns.heatmap(corr_values, cmap="RdYlBu", vmin=-1, vmax=1)
plt.title("correlation heat map")
plt.show()
# Lower-triangle correlation matrix with annotations.
features=['LOAN_AMOUNT', 'LOAN_TERM', 'INTEREST_RATE',
'MONTHLY_PAYMENT', 'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING',
'ANNUAL_INCOME', 'DEBT_TO_INCOME', 'DEFAULT']
# dtype=bool replaces the deprecated np.bool alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.zeros_like(df_loan[features].corr(), dtype=bool)
# Hide the redundant upper triangle (the matrix is symmetric).
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(16, 12))
plt.title('Correlation Matrix',fontsize=25)
sns.heatmap(df_loan[features].corr(),vmax=1.0,square=True,cmap="RdYlBu",
linecolor='w',annot=True,mask=mask,cbar_kws={"shrink": .75})
Now let’s look at how much each independent variable correlates with the DEFAULT.
# Correlation of every numeric column with the target, strongest first.
corr_matrix = df_loan.corr()
corr_matrix["DEFAULT"].sort_values(ascending=False)
# Scatter matrix of the attributes most correlated with DEFAULT.
attributes = ["DEFAULT", "INTEREST_RATE", "MONTHLY_PAYMENT",'LOAN_AMOUNT','DEBT_TO_INCOME']
scatter_matrix(df_loan[attributes], figsize=(12, 8))
plt.savefig('matrix.png')
Create dummy variables for four categorical variables.
# One-hot encode the four categorical variables.
df_loan_dummies=pd.get_dummies(df_loan,columns=['LOAN_TERM','LOAN_PURPOSE','EMPLOYMENT_LENGTH', 'HOUSING'])
df_loan_dummies.head().T
# Persist the encoded frame for reuse outside this notebook.
df_loan_dummies.to_csv("df_loan_dummies.csv",index=False)
df_loan_dummies.columns
# Feature/target split; stratify keeps the ~90:10 class ratio in both splits.
X = df_loan_dummies.drop("DEFAULT",axis=1)
y = df_loan_dummies["DEFAULT"]
y=y.astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=47, stratify=y)
df_loan_dummies["DEFAULT"].value_counts()
sns.countplot(x="DEFAULT",data=df_loan_dummies)
plt.show()
# Class balance on the dummy-encoded frame ("\033[1m" renders bold output).
count_no_default = len(df_loan_dummies.loc[df_loan_dummies['DEFAULT'] == 0])
count_default = len(df_loan_dummies.loc[df_loan_dummies['DEFAULT'] == 1])
total = count_no_default + count_default
pct_of_no_default = count_no_default / total
print("\033[1m percentage of no default is", pct_of_no_default*100)
pct_of_default = count_default / total
print("\033[1m percentage of default", pct_of_default*100)
Our classes are imbalanced, and the ratio of no-default to default instances is 90:10.
With our training data created, I’ll up-sample the default class using the SMOTE algorithm (Synthetic Minority Oversampling Technique). At a high level, SMOTE creates synthetic minority-class samples by interpolating between existing minority-class observations and their nearest neighbors.
We are going to implement SMOTE in Python.
# Oversample the minority (default) class with SMOTE so classifiers train on
# a balanced set. Only the TRAINING split is resampled; the test split keeps
# the true class distribution.
# Renamed the sampler from `os` (which shadowed the stdlib os module).
smote = SMOTE(random_state=47)
columns1 = X.columns
# fit_resample is the current imbalanced-learn API; the original fit_sample
# alias was deprecated in 0.4 and removed in 0.8.
os_data_X,os_data_y=smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns1)
os_data_y= pd.DataFrame(data=os_data_y,columns=['DEFAULT'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no default in oversampled data",len(os_data_y[os_data_y['DEFAULT']==0]))
print("Number of default",len(os_data_y[os_data_y['DEFAULT']==1]))
print("Proportion of no default data in oversampled data is ",len(os_data_y[os_data_y['DEFAULT']==0])/len(os_data_X))
print("Proportion of default data in oversampled data is ",len(os_data_y[os_data_y['DEFAULT']==1])/len(os_data_X))
cols=columns1
# Replace the imbalanced training split with the balanced one.
X_train=os_data_X[cols]
y_train=os_data_y['DEFAULT']
X_train.columns
# Class balance after SMOTE: both classes should sit at ~50%.
ax=pd.value_counts(y_train.values,normalize=True).plot.bar(figsize=(12, 5),color=['blue','orange'])
ax.set_title('DEFAULT')
# NOTE(review): value_counts orders bars by frequency; with a near 50/50
# split the bar order (and hence these labels) is not guaranteed -- confirm
# the label-to-bar mapping.
ax.set_xticklabels(['DEFAULT=True','NO DEFAULT=False'])
plt.show()
# Compare each dummy column's value distribution before balancing (original
# dummy-encoded data) and after SMOTE balancing (oversampled training data).
# The original cell repeated the same six plotting lines for all twenty
# columns; this loop removes the duplication and normalizes the slightly
# inconsistent titles (several read "Data before X" instead of
# "Data before balancing X").
balance_columns = [
    'LOAN_TERM_ 36 months', 'LOAN_TERM_ 60 months',
    'LOAN_PURPOSE_car', 'LOAN_PURPOSE_credit_card',
    'LOAN_PURPOSE_debt_consolidation', 'LOAN_PURPOSE_home_improvement',
    'LOAN_PURPOSE_major_purchase', 'LOAN_PURPOSE_medical',
    'LOAN_PURPOSE_moving', 'LOAN_PURPOSE_other', 'LOAN_PURPOSE_vacation',
    'EMPLOYMENT_LENGTH_1-2 Years', 'EMPLOYMENT_LENGTH_3-4 Years',
    'EMPLOYMENT_LENGTH_5-6 Years', 'EMPLOYMENT_LENGTH_7-8 Years',
    'EMPLOYMENT_LENGTH_9-10 Years', 'EMPLOYMENT_LENGTH_< 1 year',
    'EMPLOYMENT_LENGTH_>10 Years', 'HOUSING_yes', 'HOUSING_no',
]
for col in balance_columns:
    plt.figure(figsize=(25, 25))
    # Left panel: distribution in the full dummy-encoded data.
    plt.subplot(1, 2, 1)
    plt.title("Data before balancing " + col)
    pd.value_counts(df_loan_dummies[col].values, normalize=True).plot.bar(
        figsize=(12, 5), color=['blue', 'darkorange'])
    # Right panel: distribution in the SMOTE-balanced training split.
    plt.subplot(1, 2, 2)
    plt.title("Data after balancing " + col)
    pd.value_counts(X_train[col].values, normalize=True).plot.bar(
        figsize=(12, 5), color=['blue', 'darkorange'])
# Correlation heat map restricted to the dummy-encoded columns plus target.
relevantColumns=['LOAN_TERM_ 36 months', 'LOAN_TERM_ 60 months', 'LOAN_PURPOSE_car', 'LOAN_PURPOSE_credit_card', 'LOAN_PURPOSE_debt_consolidation', 'LOAN_PURPOSE_home_improvement', 'LOAN_PURPOSE_major_purchase', 'LOAN_PURPOSE_medical', 'LOAN_PURPOSE_moving', 'LOAN_PURPOSE_other', 'LOAN_PURPOSE_vacation', 'EMPLOYMENT_LENGTH_1-2 Years', 'EMPLOYMENT_LENGTH_3-4 Years', 'EMPLOYMENT_LENGTH_5-6 Years', 'EMPLOYMENT_LENGTH_7-8 Years', 'EMPLOYMENT_LENGTH_9-10 Years', 'EMPLOYMENT_LENGTH_< 1 year', 'EMPLOYMENT_LENGTH_>10 Years', 'HOUSING_no', 'HOUSING_yes','DEFAULT']
plt.figure(figsize=(10,10))
sns.set_style()
corr = df_loan_dummies[relevantColumns].corr()
sns.heatmap(corr,cmap="RdYlBu",vmin=-1,vmax=1)
plt.title("correlation heat map")
We will perform a manual grid search for the K-Nearest Neighbors classifier in order to find the best hyperparameter for this algorithm.
# Manual hyperparameter sweep for KNN: try k = 1..20, scoring by ROC AUC.
# NOTE(review): the best k is selected on the TEST set, which leaks test
# information into model selection -- a validation split or cross-validation
# would be cleaner.
knn_roc = []
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
for i in range (1,21):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred1 = knn.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    knn_roc.append(roc1)
print(max(knn_roc))
We will refit an estimator using the best found parameter {'n_neighbors': 17}
# Refit KNN with the chosen hyperparameter (k=17) and evaluate on the test set.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc1=knn.score(X_test, y_test)
print(acc1)
y_pred1 = knn.predict(X_test)
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred1)
cmDf1=pd.DataFrame(cm1, index=knn.classes_, columns=knn.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf1)
# "\033[1m" is an ANSI escape that renders the notebook output in bold.
print("\033[1m The result is telling us that we have: ",(cm1[0,0]+cm1[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm1[0,1]+cm1[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm1.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred1))
per1=metrics.precision_score(y_test, y_pred1)
rec1=metrics.recall_score(y_test, y_pred1)
print("\033[1m Precision of the model:", "{:.2%}".format(per1))
print("\033[1m Recall of the model:", "{:.2%}".format(rec1))
# ROC: the reported AUC uses hard predictions, while the curve itself is
# drawn from predicted probabilities.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc1 = roc_auc_score(y_test, y_pred1)
fpr, tpr, thresholds = roc_curve(y_test, knn.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='K-Nearest Neighbors (area = %0.4f)' % roc1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print("\033[1m The ROC AUC score using the K-Nearest Neighbors algorithm is:", "{:.4%}".format(roc1))
We will perform a manual grid search for the Logistic Regression in order to find the best hyperparameter for this algorithm.
# Manual hyperparameter sweep for Logistic Regression over the inverse
# regularization strength C, scoring by ROC AUC.
# NOTE(review): the best C is selected on the TEST set, which leaks test
# information into model selection -- prefer a validation split or CV.
LR_roc = []
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for i in C:
    logreg = LogisticRegression(C=i, random_state=47)
    logreg.fit(X_train, y_train)
    y_pred1 = logreg.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    LR_roc.append(roc1)
print(max(LR_roc))
We will refit an estimator using the best found parameter {'C': 1}
# Refit Logistic Regression with the chosen C=1 and evaluate on the test set.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1, random_state=47)
logreg.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc2=logreg.score(X_test, y_test)
print(acc2)
y_pred2 = logreg.predict(X_test)
from sklearn.metrics import confusion_matrix
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
cmDf2=pd.DataFrame(cm2, index=logreg.classes_, columns=logreg.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf2)
# "\033[1m" renders the output in bold.
print("\033[1m The result is telling us that we have: ",(cm2[0,0]+cm2[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm2[0,1]+cm2[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm2.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred2))
per2=metrics.precision_score(y_test, y_pred2)
rec2=metrics.recall_score(y_test, y_pred2)
print("\033[1m Precision of the model:", "{:.2%}".format(per2))
print("\033[1m Recall of the model:", "{:.2%}".format(rec2))
# ROC: AUC from hard predictions, curve from predicted probabilities.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc2 = roc_auc_score(y_test, y_pred2)
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.4f)' % roc2)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Logistic Regression algorithm is:", "{:.4%}".format(roc2))
We have created a GridSearch for the Support Vector Machine in order to find the best hyperparameters for this algorithm, and we refit an estimator using the best parameters found: {'C': 0.1, 'gamma': 0.001}.
# Fit an SVM with the hyperparameters found by the (offline) grid search.
# probability=True enables predict_proba for the ROC curve below (slower fit).
from sklearn.svm import SVC
svc = SVC(probability=True, C=0.1, gamma=0.001, random_state=47)
svc.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc3=svc.score(X_test, y_test)
print(acc3)
y_pred3 = svc.predict(X_test)
from sklearn.metrics import confusion_matrix
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred3)
cmDf3=pd.DataFrame(cm3, index=svc.classes_, columns=svc.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf3)
# "\033[1m" renders the output in bold.
print("\033[1m The result is telling us that we have: ",(cm3[0,0]+cm3[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm3[0,1]+cm3[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm3.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred3))
per3=metrics.precision_score(y_test, y_pred3)
rec3=metrics.recall_score(y_test, y_pred3)
print("\033[1m Precision of the model:", "{:.2%}".format(per3))
print("\033[1m Recall of the model:", "{:.2%}".format(rec3))
# ROC: AUC from hard predictions, curve from predicted probabilities.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc3 = roc_auc_score(y_test, y_pred3)
fpr, tpr, thresholds = roc_curve(y_test, svc.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Support Vector Machine (area = %0.4f)' % roc3)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Support Vector Machine algorithm is:", "{:.4%}".format(roc3))
We will run a manual grid search over max_depth for the Decision Tree to find the best value of this hyperparameter.
# Manual grid search over max_depth for the Decision Tree, scored by test ROC AUC.
DT_roc = []
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
for i in range(1, 21):
    # Fix: loop-body indentation was lost in the notebook export.
    dt = DecisionTreeClassifier(max_depth=i, random_state=47)
    dt.fit(X_train, y_train)
    y_pred1 = dt.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    DT_roc.append(roc1)
print(max(DT_roc))
We will refit an estimator using the best found parameter {'max_depth': 1}
# --- Decision Tree: refit with the best grid-searched depth and evaluate ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
dt = DecisionTreeClassifier(max_depth=1, random_state=47)
dt.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc4=dt.score(X_test, y_test)
print(acc4)
y_pred4 = dt.predict(X_test)
from sklearn.metrics import confusion_matrix
cm4 = confusion_matrix(y_true=y_test, y_pred=y_pred4)
cmDf4=pd.DataFrame(cm4, index=dt.classes_, columns=dt.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf4)
print("\033[1m The result is telling us that we have: ",(cm4[0,0]+cm4[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm4[0,1]+cm4[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm4.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred4))
per4=metrics.precision_score(y_test, y_pred4)
rec4=metrics.recall_score(y_test, y_pred4)
print("\033[1m Precision of the model:", "{:.2%}".format(per4))
print("\033[1m Recall of the model:", "{:.2%}".format(rec4))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc4 = roc_auc_score(y_test, y_pred4)
fpr, tpr, thresholds = roc_curve(y_test, dt.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Decision Trees (area = %0.4f)' % roc4)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('DT_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Decision Trees algorithm is:", "{:.4%}".format(roc4))
We will run a manual grid search over max_depth for the Random Forest to find the best value of this hyperparameter.
# Manual grid search over max_depth for the Random Forest, scored by test ROC AUC.
RF_roc = []
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
for i in range(1, 21):
    # Fix: loop-body indentation was lost in the notebook export.
    rf = RandomForestClassifier(max_depth=i, random_state=47)
    rf.fit(X_train, y_train)
    y_pred1 = rf.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    RF_roc.append(roc1)
print(max(RF_roc))
We will refit an estimator using the best found parameter {'max_depth': 1}
# --- Random Forest: refit with the best grid-searched depth and evaluate ---
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=1, random_state=47)
rf.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc5=rf.score(X_test, y_test)
print(acc5)
y_pred5 = rf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm5 = confusion_matrix(y_true=y_test, y_pred=y_pred5)
cmDf5=pd.DataFrame(cm5, index=rf.classes_, columns=rf.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf5)
print("\033[1m The result is telling us that we have: ",(cm5[0,0]+cm5[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm5[0,1]+cm5[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm5.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred5))
per5=metrics.precision_score(y_test, y_pred5)
rec5=metrics.recall_score(y_test, y_pred5)
print("\033[1m Precision of the model:", "{:.2%}".format(per5))
print("\033[1m Recall of the model:", "{:.2%}".format(rec5))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc5 = roc_auc_score(y_test, y_pred5)
fpr, tpr, thresholds = roc_curve(y_test, rf.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Random Forest (area = %0.4f)' % roc5)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('RF_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Random Forest algorithm is:", "{:.4%}".format(roc5))
# --- Gaussian Naive Bayes: fit (no hyperparameters to tune) and evaluate ---
import sklearn.naive_bayes as nb
nbc = nb.GaussianNB()
nbc.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc6=nbc.score(X_test, y_test)
print(acc6)
y_pred6 = nbc.predict(X_test)
from sklearn.metrics import confusion_matrix
cm6 = confusion_matrix(y_true=y_test, y_pred=y_pred6)
cmDf6=pd.DataFrame(cm6, index=nbc.classes_, columns=nbc.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf6)
print("\033[1m The result is telling us that we have: ",(cm6[0,0]+cm6[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm6[0,1]+cm6[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm6.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred6))
per6=metrics.precision_score(y_test, y_pred6)
rec6=metrics.recall_score(y_test, y_pred6)
print("\033[1m Precision of the model:", "{:.2%}".format(per6))
print("\033[1m Recall of the model:", "{:.2%}".format(rec6))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc6 = roc_auc_score(y_test, y_pred6)
fpr, tpr, thresholds = roc_curve(y_test, nbc.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Naïve Bayes (area = %0.4f)' % roc6)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('NB_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Naïve Bayes algorithm is:", "{:.4%}".format(roc6))
We will run a manual grid search over max_depth for the Extra Trees classifier to find the best value of this hyperparameter.
# Manual grid search over max_depth for Extra Trees, scored by test ROC AUC.
ET_roc = []
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
for i in range(1, 21):
    # Fix: loop-body indentation was lost in the notebook export.
    et = ExtraTreesClassifier(max_depth=i, random_state=47)
    et.fit(X_train, y_train)
    y_pred1 = et.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    ET_roc.append(roc1)
print(max(ET_roc))
We will refit an estimator using the best found parameter {'max_depth': 2}
# --- Extra Trees: refit with the best grid-searched depth and evaluate ---
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(max_depth=2, random_state=47)
et.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc7=et.score(X_test, y_test)
print(acc7)
y_pred7 = et.predict(X_test)
from sklearn.metrics import confusion_matrix
cm7 = confusion_matrix(y_true=y_test, y_pred=y_pred7)
cmDf7=pd.DataFrame(cm7, index=et.classes_, columns=et.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf7)
print("\033[1m The result is telling us that we have: ",(cm7[0,0]+cm7[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm7[0,1]+cm7[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm7.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred7))
per7=metrics.precision_score(y_test, y_pred7)
rec7=metrics.recall_score(y_test, y_pred7)
print("\033[1m Precision of the model:", "{:.2%}".format(per7))
print("\033[1m Recall of the model:", "{:.2%}".format(rec7))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc7 = roc_auc_score(y_test, y_pred7)
fpr, tpr, thresholds = roc_curve(y_test, et.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Extra Trees (area = %0.4f)' % roc7)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('ET_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Extra Trees Classifier algorithm is:", "{:.4%}".format(roc7))
We will run a manual grid search over max_depth for the Gradient Boosting classifier to find the best value of this hyperparameter.
# Manual grid search over max_depth for Gradient Boosting, scored by test ROC AUC.
GB_roc = []
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
for i in range(1, 21):
    # Fix: loop-body indentation was lost in the notebook export.
    gb = GradientBoostingClassifier(max_depth=i, random_state=47)
    gb.fit(X_train, y_train)
    y_pred1 = gb.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    GB_roc.append(roc1)
print(max(GB_roc))
We will refit an estimator using the best found parameter {'max_depth': 1}
# --- Gradient Boosting: refit with the best grid-searched depth and evaluate ---
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth=1, random_state=47)
gb.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc8=gb.score(X_test, y_test)
print(acc8)
y_pred8 = gb.predict(X_test)
from sklearn.metrics import confusion_matrix
cm8 = confusion_matrix(y_true=y_test, y_pred=y_pred8)
cmDf8=pd.DataFrame(cm8, index=gb.classes_, columns=gb.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf8)
print("\033[1m The result is telling us that we have: ",(cm8[0,0]+cm8[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm8[0,1]+cm8[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm8.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred8))
per8=metrics.precision_score(y_test, y_pred8)
rec8=metrics.recall_score(y_test, y_pred8)
print("\033[1m Precision of the model:", "{:.2%}".format(per8))
print("\033[1m Recall of the model:", "{:.2%}".format(rec8))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc8 = roc_auc_score(y_test, y_pred8)
fpr, tpr, thresholds = roc_curve(y_test, gb.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Gradient Boosting (area = %0.4f)' % roc8)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('GB_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Gradient Boosting Classifier algorithm is:", "{:.4%}".format(roc8))
We will run a manual grid search over n_estimators for the AdaBoost classifier to find the best value of this hyperparameter.
# Manual grid search over n_estimators for AdaBoost, scored by test ROC AUC.
Ada_roc = []
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
num_estimators = [1, 10, 20, 40, 60, 80, 100]
for i in num_estimators:
    # Fix: loop-body indentation was lost in the notebook export.
    ad = AdaBoostClassifier(n_estimators=i, random_state=47)
    ad.fit(X_train, y_train)
    y_pred1 = ad.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    Ada_roc.append(roc1)
print(max(Ada_roc))
We will refit an estimator using the best found parameter {'n_estimators': 1}
# --- AdaBoost: refit with the best grid-searched estimator count and evaluate ---
from sklearn.ensemble import AdaBoostClassifier
ad = AdaBoostClassifier(n_estimators=1, random_state=47)
ad.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc9=ad.score(X_test, y_test)
print(acc9)
y_pred9 = ad.predict(X_test)
from sklearn.metrics import confusion_matrix
cm9 = confusion_matrix(y_true=y_test, y_pred=y_pred9)
cmDf9=pd.DataFrame(cm9, index=ad.classes_, columns=ad.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf9)
print("\033[1m The result is telling us that we have: ",(cm9[0,0]+cm9[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm9[0,1]+cm9[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm9.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred9))
per9=metrics.precision_score(y_test, y_pred9)
rec9=metrics.recall_score(y_test, y_pred9)
print("\033[1m Precision of the model:", "{:.2%}".format(per9))
print("\033[1m Recall of the model:", "{:.2%}".format(rec9))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc9 = roc_auc_score(y_test, y_pred9)
fpr, tpr, thresholds = roc_curve(y_test, ad.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='AdaBoost (area = %0.4f)' % roc9)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('AdaBoost_ROC')
plt.show()
print("\033[1m The ROC AUC score using the AdaBoost Classifier algorithm is:", "{:.4%}".format(roc9))
We will run a manual grid search over max_depth for the XGBoost classifier to find the best value of this hyperparameter.
# Manual grid search over max_depth for XGBoost, scored by test ROC AUC.
XG_roc = []
import re
# XGBoost rejects feature names containing '[', ']' or '<' — sanitize them first.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test.columns.values]
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
for i in range(1, 21):
    # Fix: loop-body indentation was lost in the notebook export.
    xg = XGBClassifier(max_depth=i, random_state=47)
    xg.fit(X_train, y_train)
    y_pred1 = xg.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    XG_roc.append(roc1)
print(max(XG_roc))
We will refit an estimator using the best found parameter {'max_depth': 8}
# --- XGBoost: refit with the best grid-searched depth and evaluate ---
import re
# XGBoost rejects feature names containing '[', ']' or '<' — sanitize them first.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test.columns.values]
from xgboost import XGBClassifier
xg = XGBClassifier(max_depth=8, random_state=47)
xg.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc10=xg.score(X_test, y_test)
print(acc10)
y_pred10 = xg.predict(X_test)
from sklearn.metrics import confusion_matrix
cm10 = confusion_matrix(y_true=y_test, y_pred=y_pred10)
cmDf10=pd.DataFrame(cm10, index=xg.classes_, columns=xg.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf10)
print("\033[1m The result is telling us that we have: ",(cm10[0,0]+cm10[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm10[0,1]+cm10[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm10.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred10))
per10=metrics.precision_score(y_test, y_pred10)
rec10=metrics.recall_score(y_test, y_pred10)
print("\033[1m Precision of the model:", "{:.2%}".format(per10))
print("\033[1m Recall of the model:", "{:.2%}".format(rec10))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc10 = roc_auc_score(y_test, y_pred10)
fpr, tpr, thresholds = roc_curve(y_test, xg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='XGBoost (area = %0.4f)' % roc10)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('XGB_ROC')
plt.show()
print("\033[1m The ROC AUC score using the XGBoost Classifier algorithm is:", "{:.4%}".format(roc10))
# Compare all ten individual models by test-set ROC AUC.
models = ['K-Nearest Neighbors','Logistic Regression',
'Support Vector Machine Classifier', 'Decision Tree Classifier',
'Random Forest Classifier', 'Naïve Bayes Classifier',
'Extra Trees Classifier', 'Gradient Boosting Classifier',
'AdaBoost Classifier', 'XGBoost Boosting Classifier']
# NOTE(review): roc1 is reassigned by every grid-search loop above, so by this
# point it holds the last loop's score, not KNN's — verify before trusting row 0.
tests_roc = [roc1, roc2, roc3, roc4, roc5, roc6, roc7, roc8, roc9, roc10]
compare_models = pd.DataFrame({ "Algorithms": models, "ROC AUC": tests_roc})
# sort_values returns a new frame; in a notebook this is a displayed cell output.
compare_models.sort_values(by = "ROC AUC", ascending = False)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(8,8))
sns.barplot(x = "ROC AUC", y = "Algorithms", data = compare_models)
plt.show()
# Feature-importance bar chart for the fitted decision tree.
# Index 30 is excluded from the labels — presumably the target column; TODO confirm against `cols`.
default_features = [x for i,x in enumerate(cols) if i!=30]
def plot_feature_importances_default(model):
    """Plot a horizontal bar chart of a fitted model's feature_importances_.

    `model` must expose `feature_importances_` with one value per entry of
    `default_features`.
    """
    plt.figure(figsize=(10,10))
    # Bug fix: len(cols) counted the excluded column too, misaligning the
    # bars with the tick labels; count the features actually plotted.
    n_features = len(default_features)
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), default_features)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
plot_feature_importances_default(dt)
plt.savefig('feature_importance')
# --- Simple averaging ensemble of the three tree-based classifiers ---
ET_clf = et
RF_clf = rf
DT_clf = dt
ET_clf.fit(X_train, y_train)
RF_clf.fit(X_train, y_train)
DT_clf.fit(X_train, y_train)
ET_pred = ET_clf.predict(X_test)
RF_pred = RF_clf.predict(X_test)
DT_pred = DT_clf.predict(X_test)
from sklearn.metrics import accuracy_score
# Bug fix: the original integer division (//3) of the three 0/1 predictions
# returned 1 only on a unanimous vote. Rounding the true mean implements the
# intended averaging (i.e. a 2-of-3 majority vote).
averaged_preds = ((ET_pred + RF_pred + DT_pred) / 3).round().astype(int)
print('\n accuracy:')
print('---------------')
acc11=accuracy_score(y_test,averaged_preds)
print(acc11)
from sklearn.metrics import confusion_matrix
cm11 = confusion_matrix(y_true=y_test, y_pred=averaged_preds)
cmDf11=pd.DataFrame(cm11, index=DT_clf.classes_, columns=DT_clf.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf11)
print("\033[1m The result is telling us that we have: ",(cm11[0,0]+cm11[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm11[0,1]+cm11[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm11.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, averaged_preds))
per11=metrics.precision_score(y_test, averaged_preds)
rec11=metrics.recall_score(y_test, averaged_preds)
print("\033[1m Precision of the model:", "{:.2%}".format(per11))
print("\033[1m Recall of the model:", "{:.2%}".format(rec11))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc11 = roc_auc_score(y_test, averaged_preds)
# Bug fix: floor division (//3) collapsed the averaged probabilities to 0
# almost everywhere, degenerating the ROC curve; true division keeps them
# continuous in [0, 1].
average_proba = (ET_clf.predict_proba(X_test)[:,1]+ RF_clf.predict_proba(X_test)[:,1]+ DT_clf.predict_proba(X_test)[:,1])/3
fpr, tpr, thresholds = roc_curve(y_test, average_proba)
plt.figure()
plt.plot(fpr, tpr, label='Simple Averaging Approach (area = %0.4f)' % roc11)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('Averaging_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Simple Averaging Approach is:", "{:.4%}".format(roc11))
# --- Soft-voting ensemble of the three tree-based classifiers ---
voting_clf = VotingClassifier(estimators=[('ET', ET_clf), ('RF', RF_clf), ('DT', DT_clf)], voting='soft')
voting_clf.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc12=voting_clf.score(X_test, y_test)
print(acc12)
voting_preds = voting_clf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm12 = confusion_matrix(y_true=y_test, y_pred=voting_preds)
cmDf12=pd.DataFrame(cm12, index=voting_clf.classes_, columns=voting_clf.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf12)
print("\033[1m The result is telling us that we have: ",(cm12[0,0]+cm12[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm12[0,1]+cm12[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm12.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, voting_preds))
per12=metrics.precision_score(y_test, voting_preds)
rec12=metrics.recall_score(y_test, voting_preds)
print("\033[1m Precision of the model:", "{:.2%}".format(per12))
print("\033[1m Recall of the model:", "{:.2%}".format(rec12))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc12 = roc_auc_score(y_test, voting_preds)
fpr, tpr, thresholds = roc_curve(y_test, voting_clf.predict_proba(X_test)[:,1])
plt.figure()
# Bug fix: label used a stray backslash ('Voting\Stacking'), inconsistent
# with the 'Voting/Stacking' wording used in the summary print below.
plt.plot(fpr, tpr, label='Voting/Stacking Classification (area = %0.4f)' % roc12)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('Voting_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Voting/Stacking Classification is:", "{:.4%}".format(roc12))
We will run a manual grid search over n_estimators for the Bagging classifiers to find the best value of this hyperparameter.
# Manual grid search over n_estimators for bagged tree ensembles (combined
# with soft voting), scored by test ROC AUC.
Bagg_roc = []
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score
num_estimators = [1, 10, 20, 40, 60, 80, 100]
for i in num_estimators:
    # Fix: loop-body indentation was lost in the notebook export.
    # NOTE(review): `base_estimator` was renamed `estimator` in sklearn 1.2
    # and removed in 1.4 — update if the environment is upgraded.
    DT_Bagg = BaggingClassifier(base_estimator=DT_clf, n_estimators=i, random_state=47)
    RF_Bagg = BaggingClassifier(base_estimator=RF_clf, n_estimators=i, random_state=47)
    ET_Bagg = BaggingClassifier(base_estimator=ET_clf, n_estimators=i, random_state=47)
    DT_Bagg.fit(X_train, y_train)
    RF_Bagg.fit(X_train, y_train)
    ET_Bagg.fit(X_train, y_train)
    voting_Bagg = VotingClassifier(estimators=[('ET', ET_Bagg), ('RF', RF_Bagg), ('DT', DT_Bagg)], voting='soft')
    voting_Bagg.fit(X_train, y_train)
    y_pred1 = voting_Bagg.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    Bagg_roc.append(roc1)
print(max(Bagg_roc))
We will refit an estimator using the best found parameter {'n_estimators': 20}
# --- Bagging ensemble: refit with the best grid-searched estimator count and evaluate ---
from sklearn.ensemble import BaggingClassifier
# NOTE(review): `base_estimator` was renamed `estimator` in sklearn 1.2 and
# removed in 1.4 — update if the environment is upgraded.
DT_Bagg = BaggingClassifier(base_estimator=DT_clf, n_estimators=20, random_state=47)
RF_Bagg = BaggingClassifier(base_estimator=RF_clf, n_estimators=20, random_state=47)
ET_Bagg = BaggingClassifier(base_estimator=ET_clf, n_estimators=20, random_state=47)
DT_Bagg.fit(X_train, y_train)
RF_Bagg.fit(X_train, y_train)
ET_Bagg.fit(X_train, y_train)
voting_Bagg = VotingClassifier(estimators=[('ET', ET_Bagg), ('RF', RF_Bagg), ('DT', DT_Bagg)], voting='soft')
voting_Bagg.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc13=voting_Bagg.score(X_test, y_test)
print(acc13)
voting_preds1 = voting_Bagg.predict(X_test)
from sklearn.metrics import confusion_matrix
cm13 = confusion_matrix(y_true=y_test, y_pred=voting_preds1)
cmDf13=pd.DataFrame(cm13, index=voting_Bagg.classes_, columns=voting_Bagg.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf13)
print("\033[1m The result is telling us that we have: ",(cm13[0,0]+cm13[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm13[0,1]+cm13[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm13.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, voting_preds1))
per13=metrics.precision_score(y_test, voting_preds1)
rec13=metrics.recall_score(y_test, voting_preds1)
print("\033[1m Precision of the model:", "{:.2%}".format(per13))
print("\033[1m Recall of the model:", "{:.2%}".format(rec13))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc13 = roc_auc_score(y_test, voting_preds1)
fpr, tpr, thresholds = roc_curve(y_test, voting_Bagg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Bagging Classification (area = %0.4f)' % roc13)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('Bagging_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Bagging Classification is:", "{:.4%}".format(roc13))
# --- Boosting ensemble: soft-vote over AdaBoost, XGBoost and Gradient Boosting ---
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
import re
# XGBoost rejects feature names containing '[', ']' or '<' — sanitize them first.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test.columns.values]
from sklearn.ensemble import BaggingClassifier
Ada_clf = ad
XG_clf = xg
Grad_clf = gb
Ada_clf.fit(X_train, y_train)
XG_clf.fit(X_train, y_train)
Grad_clf.fit(X_train, y_train)
voting_Boost = VotingClassifier(estimators=[('Grad', Grad_clf), ('XG', XG_clf), ('Ada', Ada_clf)], voting='soft')
voting_Boost.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc14=voting_Boost.score(X_test, y_test)
print(acc14)
voting_preds2 = voting_Boost.predict(X_test)
from sklearn.metrics import confusion_matrix
cm14 = confusion_matrix(y_test, voting_preds2)
cmDf14=pd.DataFrame(cm14, index=voting_Boost.classes_, columns=voting_Boost.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf14)
print("\033[1m The result is telling us that we have: ",(cm14[0,0]+cm14[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm14[0,1]+cm14[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm14.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, voting_preds2))
per14=metrics.precision_score(y_test, voting_preds2)
rec14=metrics.recall_score(y_test, voting_preds2)
print("\033[1m Precision of the model:", "{:.2%}".format(per14))
print("\033[1m Recall of the model:", "{:.2%}".format(rec14))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc14 = roc_auc_score(y_test, voting_preds2)
fpr, tpr, thresholds = roc_curve(y_test, voting_Boost.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Boosting Classification (area = %0.4f)' % roc14)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('Boosting_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Boosting Classification is:", "{:.4%}".format(roc14))
# Compare the four ensembling strategies by test-set ROC AUC.
methods = ['Simple Averaging Approach','Voting/Stacking Classification',
'Bagging Classification','Boosting Classification']
tests_roc = [roc11, roc12, roc13, roc14]
compare_models = pd.DataFrame({ "Methods": methods, "ROC AUC": tests_roc})
# sort_values returns a new frame; in a notebook this is a displayed cell output.
compare_models.sort_values(by = "ROC AUC", ascending = False)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(8,8))
sns.barplot(x = "ROC AUC", y = "Methods", data = compare_models)
plt.show()
We will run a manual grid search over max_iter for the Multi-layer Perceptron to find the best value of this hyperparameter.
# Manual grid search over max_iter for the MLP, scored by test ROC AUC.
MLP_roc = []
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
max_iter= [1, 10, 20, 40, 60, 80, 100]
for i in max_iter:
    # Fix: loop-body indentation was lost in the notebook export.
    mlp = MLPClassifier(max_iter=i, random_state=47)
    mlp.fit(X_train, y_train)
    y_pred1 = mlp.predict(X_test)
    roc1 = roc_auc_score(y_test, y_pred1)
    print(i, roc1)
    MLP_roc.append(roc1)
print(max(MLP_roc))
We will refit an estimator using the best found parameter {'max_iter': 100}
# --- Multi-layer Perceptron: refit with the best grid-searched max_iter and evaluate ---
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=100, random_state=47)
mlp.fit(X_train, y_train)
print('\n accuracy:')
print('---------------')
acc15=mlp.score(X_test, y_test)
print(acc15)
y_pred15 = mlp.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm15 = confusion_matrix(y_test, y_pred15)
cmDf15=pd.DataFrame(cm15, index=mlp.classes_, columns=mlp.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf15)
print("\033[1m The result is telling us that we have: ",(cm15[0,0]+cm15[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm15[0,1]+cm15[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm15.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred15))
per15=metrics.precision_score(y_test, y_pred15)
rec15=metrics.recall_score(y_test, y_pred15)
print("\033[1m Precision of the model:", "{:.2%}".format(per15))
print("\033[1m Recall of the model:", "{:.2%}".format(rec15))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# NOTE(review): AUC from hard labels; the curve uses probabilities.
roc15 = roc_auc_score(y_test, y_pred15)
fpr, tpr, thresholds = roc_curve(y_test, mlp.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Multi-layer Perceptron (area = %0.4f)' % roc15)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Bug fix: was 'Log_ROC', which overwrote the logistic-regression plot file.
plt.savefig('MLP_ROC')
plt.show()
print("\033[1m The ROC AUC score using the Multi-layer Perceptron algorithm is:", "{:.4%}".format(roc15))
# Shortlist comparison: the decision tree vs the two strongest alternatives.
models = ['Decision Tree','Bagging Classification',
'Multi-layer Perceptron']
tests_roc = [roc4, roc13, roc15]
compare_models = pd.DataFrame({ "Algorithms": models, "ROC AUC": tests_roc})
# sort_values returns a new frame; in a notebook this is a displayed cell output.
compare_models.sort_values(by = "ROC AUC", ascending = False)
# Refit the depth-1 tree and report its final test-set accuracy.
dt.fit(X_train, y_train)
print("\033[1m Accuracy of Decision Tree on test set:", "{:.4f}".format(dt.score(X_test, y_test)))
Cross-validation attempts to avoid overfitting while still producing a prediction for each observation in the dataset. We are using 10-fold cross-validation to train our model.
# 10-fold cross-validated accuracy of the selected decision tree on the training set.
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# Bug fix: shuffle=True is required when random_state is set — sklearn >= 0.24
# raises ValueError for KFold(random_state=...) without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=47)
modelCV = dt
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("\033[1m 10-fold cross validation average accuracy:", "{:.4f}".format((results.mean())))
# Final confusion matrix and classification report for the selected decision tree.
y_pred4 = dt.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm4 = confusion_matrix(y_test, y_pred4)
cmDf4=pd.DataFrame(cm4, index=dt.classes_, columns=dt.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf4)
print('-------------------')
print("\033[1m The result is telling us that we have: ",(cm4[0,0]+cm4[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm4[0,1]+cm4[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm4.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred4))
To quote from Scikit Learn:
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.
The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.
The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important.
The support is the number of occurrences of each class in y_test.
# Headline test-set metrics for the Decision Tree. Each metric is computed
# once and reused (the original recomputed every score for each print).
acc4 = metrics.accuracy_score(y_test, y_pred4)
prec4 = metrics.precision_score(y_test, y_pred4)
rec4 = metrics.recall_score(y_test, y_pred4)

#calculate Accuracy, how often is the classifier correct?
print("\nAccuracy of the Decision Tree algorithm:", "{:.2%}".format(acc4))
print("Accuracy: Well, we got a classification rate of", "{:.2%}".format(acc4))
#calculate Precision
print("\nPrecision of the Decision Tree algorithm:", "{:.2%}".format(prec4))
print("Precision: Precision is about being precise, i.e., how precise our model is. In other words, we can say, when a model makes a prediction, how often it is correct. In our prediction case, when our model predict a loan is about to default, that loan actually defaulted", "{:.2%}".format(prec4) ,"of the time.")
#calculate Recall
print("\nRecall of the Decision Tree algorithm:", "{:.2%}".format(rec4))
print("Recall: If there is a loan that defaulted present in the test set, our model can identify it", "{:.2%}".format(rec4) ,"of the time.")
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# BUG FIX: roc_auc_score expects continuous scores, not the hard 0/1 labels
# from dt.predict(); scoring labels understates the AUC and disagrees with
# the plotted curve (which already uses probabilities). Compute the class-1
# probability once and use it for both the AUC and the curve.
dt_scores = dt.predict_proba(X_test)[:, 1]
dt_roc_auc = roc_auc_score(y_test, dt_scores)
fpr, tpr, thresholds = roc_curve(y_test, dt_scores)

plt.figure()
plt.plot(fpr, tpr, label= 'Decision Tree (area = %0.4f)' % dt_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # random-classifier baseline
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
Now that our model has been built, let us use it for real-time predictions.
# Score every loan in the full dataset with the fitted Decision Tree and
# export the results to CSV for downstream use.
chosenModel = cols   # feature columns the model was trained on
chosenModel          # notebook display of the chosen feature list

# Predicted probability of default, plus identifiers and hard predictions.
df_loan_dummies['Probability_to_Default'] = dt.predict_proba(df_loan_dummies[chosenModel])[:, 1]
df_loan_dummies["LOAN_ID"] = df['LOAN_ID']
df_loan_dummies["PREDICTED"] = dt.predict(X)
df_loan_dummies["TRUE"] = df_loan_dummies["DEFAULT"]

# Peek at the first rows, then persist the whole scored table.
df_loan_dummies[["LOAN_ID", "TRUE", "PREDICTED", "Probability_to_Default"]].head(10)
df_loan_dummies.to_csv('Prob_to_Default.csv', index=False, encoding='utf-8')
# Interactively collect the 30 model features for one new loan, in the
# exact column order the model was trained on: (caster, prompt) pairs are
# asked in sequence and the answers shaped into a single-row feature array.
_prompts = [
    (int,   "Please enter the loan amount without comma-separated (for example, 20000):"),
    (float, "Please enter the loan rate in percentage (for example, 17.93):"),
    (float, "Please enter the monthly payment (for example, 342.94):"),
    (float, "Please enter the annual income without comma-separated (for example, 344304):"),
    (float, "Please enter the debt to income ratio in in percentage (for example, 18.47):"),
    (int,   "Does the loan term is 36 months (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan term is 60 months (1 if yes, 0 otherwise)? (for example, 1):"),
    (int,   "Does the loan purpose is for a car (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a credit card (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a debt consolidation (1 if yes, 0 otherwise)? (for example, 1):"),
    (int,   "Does the loan purpose is for an education (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a home improvement (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a house (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a major purchase (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a medical treatment (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for moving (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for other purpose (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a renewable_energy (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a small_business (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a vacation (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan purpose is for a wedding (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan applicant's employment length range between 1 to 2 years (1 if yes, 0 otherwise)? (for example, 1):"),
    (int,   "Does the loan applicant's employment length range between 3 to 4 years (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan applicant's employment length range between 5 to 6 years (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan applicant's employment length range between 7 to 8 years (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan applicant's employment length range between 9 to 10 years (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Is the loan applicant's employment lower than 1 year (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Is the loan applicant's employment higher than 10 years (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan applicant rent a home (1 if yes, 0 otherwise)? (for example, 0):"),
    (int,   "Does the loan applicant own a home (1 if yes, 0 otherwise)? (for example, 1):"),
]
_answers = [cast(input(prompt)) for cast, prompt in _prompts]

# Keep the individual a_1..a_30 names around for compatibility with any
# later cell that references them directly.
(a_1, a_2, a_3, a_4, a_5, a_6, a_7, a_8, a_9, a_10,
 a_11, a_12, a_13, a_14, a_15, a_16, a_17, a_18, a_19, a_20,
 a_21, a_22, a_23, a_24, a_25, a_26, a_27, a_28, a_29, a_30) = _answers

new_data = np.array(_answers).reshape(1, -1)
# Classify the interactively-entered loan and report the verdict.
new_pred = dt.predict(new_data)
new_prob = dt.predict_proba(new_data)

verdict = int(new_pred[0])
verdict  # notebook display of the hard prediction

if verdict == 1:
    print("\033[1m \nThe new loan is predicted to default (Don't give this applicant any money!!!!)\033[1m")
    # When the prediction is 1, the probability of the predicted class is
    # the larger of the two, so max(new_prob[0]) is P(default).
    print("\033[1m \nThe default probability of this applicant is", "{:.4%}".format(max(new_prob[0])))
else:
    print("\033[1m \nThe new loan is not predicted to default (continue checking this applicant)\033[1m")